# Load packages
library(here)
library(tidyverse)
library(gt)
library(e1071)
library(scales)
library(corrplot)
library(caret)
library(randomForest)
library(glmnet)
library(gbm)
# Read in data
data  <- read_csv(here("inputs//data_prep.csv"))
data <- data %>%
  mutate(room_type = as.factor(room_type), license = as.factor(license))

Introduction

For this analysis we’re explore the listings data of Airbnb rentals in Toronto. The data can be found at link

We’re primarily interested in the rental price however we’ll explore the entire dataset for anything interesting and visualize the results. The analysis will conclude with a model for predicting rental prices.

We’ve previously removed all listings with no reviews, we also remove all listings with no availability within the next year as these are likely no longer actively being rented and removed NA columns from the data.

Data Exploration

We’ll generate some initial summary statistics of the various predictors to get started:

# Check Summary Statistics
head(data)

Some initial things to note from the summary statistics is that the vast majority of listings(67.7%) are entire homes or apartments as opposed to shared living spaces. Private rooms make up the bulk of the remainder at 31.5%.

data %>%
  select(room_type) %>%
  group_by(room_type) %>%
  summarize(listings = n()) %>%
  ungroup() %>%
  mutate(room_type = fct_reorder(room_type, listings)) %>%
  ggplot(aes(x = room_type, y = listings, fill = room_type)) +
  geom_bar(stat = "identity") + 
  theme(legend.position = "none") +
  xlab("Room Type") +
  ylab("Listings")

Like any other residential property the neighbourhood is a likely predictor of the price of the property being rented so next be look at the distribution of

Lets now look at the distribution of rental prices:

ggplot(data, aes(x = price)) +
  geom_histogram(binwidth = 10, fill = "blue", color = "black") +
  labs(title = "Distribution of Airbnb Rental Prices",
       x = "Price",
       y = "Count")

I’ve identified the major outlier in this case to be this listing. To get a better idea we can check for example how many listings are there above a price of $5000?

dim(data %>%
  filter(price > 5000))[1]
## [1] 3

We find there are only 13 listings. We can then look at a boxplot of the remaining listings after removing the ones above 5000.

data %>%
  filter(price < 5000) %>%
  ggplot(aes(x = price)) +
  geom_boxplot(color = "blue", outlier.color = "red", outlier.size = 2) +
  scale_x_continuous(breaks = c(0,1000,2000, 3000, 4000, 5000)) +
  ylim(-4,4) + 
  stat_boxplot(geom ='errorbar') +
  theme(axis.title.y = element_blank(),
              axis.text.y = element_blank(),
              axis.ticks.y = element_blank(),
        panel.grid.minor.y = element_blank(),
        panel.grid.major.y = element_blank())

The data seems to get much more spread out above a price of $1500, so we’ll focus in on those data points:

data %>%
  filter(price < 1500) %>%
  ggplot(aes(x = price)) +
  geom_boxplot(color = "blue", outlier.color = "red", outlier.size = 2) +
  scale_x_continuous(breaks = seq(from = 0, to = 1500, by = 250)) +
  ylim(-4,4) + 
  stat_boxplot(geom ='errorbar') +
  theme(axis.title.y = element_blank(),
              axis.text.y = element_blank(),
              axis.ticks.y = element_blank(),
        panel.grid.minor.y = element_blank(),
        panel.grid.major.y = element_blank())

This seems like a better representation of most of the price data for the listings.

As with any property value we may expect things like the number of bedrooms in the listing to be a predictor of price. Larger properties should have higher bedroom counts and thus be more expensive to rent:

data %>%
  filter(price < 1500) %>%
  mutate(bedrooms = as.factor(bedrooms)) %>%
  select(price, bedrooms) %>%
  group_by(bedrooms) %>%
  summarize(mean_price = mean(price)) %>%
  ungroup() %>%
  ggplot(aes(x = bedrooms, y = mean_price, fill = bedrooms)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(breaks = seq(from = 0, to = 1500, by = 250)) +
  theme(legend.position = "none") +
  xlab("# of Bedrooms") +
  ylab("Mean Price")

There appears to be only 1 listing with 9 bedrooms.

data %>%
  filter(price < 1500 & bedrooms == 9)

Because the price is so wildly off from the overall trend I think it’s best to remove this point in the model.

Next it seems likely review scores should be a strong predictor of prices:

data %>%
  filter(price < 1500 & bedrooms < 9 & number_of_reviews > 5) %>%
  ggplot(aes(x = review_scores_rating, y = price)) +
    geom_point(alpha = 0.5, color = "#43a2ca") +
    geom_smooth(method = "loess", se = FALSE, color = "red") +
  labs(title = "Price vs. Review Scores Rating",
       x = "Review Scores",
       y = "Price")

We can similarly do some quick visualizations to see the relationships among other predictors and the price variable:

data %>%
  filter(price < 1500 & bedrooms < 9 & number_of_reviews > 5) %>%
  ggplot(aes(x = minimum_nights)) + geom_histogram(binwidth = 10, fill = "blue", color = "black") +
  labs(title = "Distribution of Minimun Nights",
       x = "Nights",
       y = "Count")

Clearly we have a pretty wide tail to the right on this distribution, suggesting that there are again a few extreme outliers in this data.

data %>%
  filter(price < 1500 & bedrooms < 9 & number_of_reviews > 5) %>%
  select(minimum_nights) %>%
  group_by(minimum_nights) %>%
  summarize(n = n()) %>%
  arrange(desc(n))
data %>%
  filter(price < 1500 & bedrooms < 9 & number_of_reviews > 5 & minimum_nights > 1000)

Interestingly by far the most common minimum nights selection on AirBNB is full month stays. Followed by 1,2,3 nights. The outliers we’re seeing in minimum nights largely seem to be inactive listings, with last reviews here for example being from 2016. We would like to remove listings like this but instead of making a cut off for minimum nights we will seek to do this by last review date. This ensures that the pricing we’re seeing in the data is currently active listings.

data %>%
  filter(price < 1500 & bedrooms < 9 & number_of_reviews > 5) %>%
  ggplot(aes(x = last_review)) + geom_histogram(binwidth = 10, fill = "blue", color = "black") +
  labs(title = "Distribution of Latest Review Date",
       x = "Last Review Date",
       y = "Count")

As we can see the vast majority of these listings are recent, however we are getting a stretch of listings dating back all the way to 2015. For the sake of ensuring the pricing is accurate especially considering the price disturbances caused during Covid, we’ll remove listings who have not received a review after 2020.

data %>%
  filter(price < 1500 & bedrooms < 9 & number_of_reviews > 5 & last_review >= '2020-01-01')

Next we’ll check the relationship of the number of reviews for a listing and it’s price

data %>%
  filter(price < 1500 & bedrooms < 9 & number_of_reviews > 5 & last_review >= '2020-01-01') %>%
  ggplot(aes(x = number_of_reviews, y = price)) + geom_point( color = "blue") +
  labs(title = "Number of Reviews vs Price",
       x = "Number of Reviews",
       y = "Price")

There doesn’t appear to be any correlation here but we may the listings with very high review counts make this data difficult to see, we can visualize this again with a limit on the number of reviews:

data %>%
  filter(price < 1500 & bedrooms < 9 & number_of_reviews > 5 & last_review >= '2020-01-01' & number_of_reviews < 180) %>%
  ggplot(aes(x = number_of_reviews, y = price)) + geom_point( color = "blue") +
  labs(title = "Number of Reviews vs Price",
       x = "Number of Reviews",
       y = "Price")

No obvious correlation seems to be present in the relationship between the number of reviews and the price of the listing.

We should also check for linear correlations between predictors, we can visualize this via a correlation matrix

df <- data %>%
  filter(price < 1500 & bedrooms < 9 & number_of_reviews > 5 & last_review >= '2020-01-01') %>%
  select(price, minimum_nights, number_of_reviews, review_scores_rating, calculated_host_listings_count)

corrplot(cor(as.matrix(df),method = "spearman"),
                 method = "color",
                  tl.cex = 0.9,
                 number.cex = 0.95,
                 addCoef.col = "black")

None of these values seem to be particularly large so it seems we can safely assume that there is no multicolinearity present in these predictors.

Model Selection

df <- data %>%
  filter(price < 1500 & bedrooms < 9 & number_of_reviews > 5 & last_review >= '2020-01-01') %>%
  select(id, longitude, latitude, price, room_type, minimum_nights, number_of_reviews, last_review, reviews_per_month, availability_365, bedrooms, calculated_host_listings_count,
         number_of_reviews_ltm) %>% drop_na()

We would like to find a model that outperforms linear regression trained using all predictors. This is the simplest model we could employ, though we found no evidence that the predictors have linear relationships with price it will serve as a good baseline to assess the performance of other models which are more computationally expensive to train.

set.seed(1)

train_data <- df %>%
  sample_frac(0.70)

test_data <- anti_join(df, train_data, by = 'id')

# Train model
mod.lm <- lm(price ~ .-id, data = train_data)

# Test model on test set
predictions <- predict(mod.lm, test_data, type = "response")

# Calculating RMSE and MAE
mae <- mean(abs(test_data$price - predictions))
cat("Mean Absolute Error (MAE):", mae, "\n")
## Mean Absolute Error (MAE): 71.10984
rmse <- sqrt(mean((test_data$price - predictions)^2))
cat("Root Mean Squared Error (RMSE):", rmse, "\n")
## Root Mean Squared Error (RMSE): 117.7678

Now we may want to test some other model techniques namely ones that contain some sort of feature selection. To do that we will use random forest and lasso regression. Ideally these models will produce lower error rates then our standard linear model as they should capture some of the non linear relationships we observed in the data exploration phase.

set.seed(1)

# Split data train/test

train_data <- df %>%
  sample_frac(0.70)

test_data <- anti_join(df, train_data, by = 'id')

# Defining Control Parameters
control_rfe <- rfeControl(functions = rfFuncs,
                          method = "cv",
                          number = 5,
                          verbose = FALSE)

# Defining Predictors
predictors <- names(train_data[, !(names(train_data) %in% c("price"))])

# Run RFE
rfe_result <- rfe(train_data[, predictors],
                  train_data$price,
                  sizes = c(1:10),
                  rfeControl = control_rfe)
print(rfe_result)
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (5 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables   RMSE Rsquared   MAE RMSESD RsquaredSD MAESD Selected
##          1 108.90   0.3315 73.98  3.025    0.03220 1.076         
##          2 106.47   0.3715 69.76  3.854    0.02159 1.750         
##          3 104.21   0.4009 66.81  4.457    0.03229 2.554         
##          4 100.29   0.4530 62.84  4.656    0.03683 2.166         
##          5  99.41   0.4657 61.61  4.582    0.03300 2.300         
##          6  95.23   0.4881 57.10  2.969    0.01019 1.194         
##          7  94.61   0.4958 56.77  4.019    0.02127 2.033         
##          8  92.77   0.5179 55.29  4.085    0.02067 1.900         
##          9  91.93   0.5242 54.83  3.686    0.01954 1.381         
##         10  91.76   0.5273 54.65  3.462    0.02065 1.394         
##         12  91.04   0.5354 53.91  3.676    0.02047 1.941        *
## 
## The top 5 variables (out of 12):
##    bedrooms, room_type, minimum_nights, latitude, number_of_reviews_ltm

The top 5 of 10 predictors are bedrooms, minimum_nights, room_type, latitude and longitude

set.seed(1)

best_features <- predictors[rfe_result$optVariables]

# Train the final model with the selected features
final_model <- randomForest(price ~ ., data = train_data[, c("price", rfe_result$optVariables)])

# Evaluate the model performance on the test dataset
predictions_1 <- predict(final_model, newdata = test_data[, rfe_result$optVariables])
performance <- postResample(predictions_1, test_data$price)
print(performance)
##        RMSE    Rsquared         MAE 
## 104.4002474   0.5593081  58.2315611
library(ggplot2)

pred_vs_true <- data.frame(Predicted = predictions_1, True = test_data$price)
ggplot(pred_vs_true, aes(x = True, y = Predicted)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(title = "Random Forest Model: Predicted vs. True Prices",
       x = "True Prices",
       y = "Predicted Prices") +
  theme_minimal()

As expected random forest performs better then the standard linear regression. Now we build a lasso regression model.

set.seed(1)

# Creating model matrix of predictors for Lasso fucntion
X <- model.matrix(price ~ . -1, data = train_data)
X.test <- model.matrix(price ~ . -1, data = test_data)
Y <- train_data$price
Y.test <- test_data$price

# Fit LASSO model
lasso.cv <- cv.glmnet(X, Y, alpha = 1, nfolds = 5)
# Finding optimal lambda
optimal <- lasso.cv$lambda.min
# Fit the optimal lambda model
lasso.mod <- glmnet(X, Y, alpha = 1, lambda = optimal)

# Make predictions on new test data
lasso.pred <- predict(lasso.mod, optimal, newx = X.test)

# Compute error
mae <- mean(abs(lasso.pred-Y.test))
cat("Mean Absolute Error (MAE):", mae, "\n")
## Mean Absolute Error (MAE): 70.63944
rmse <- sqrt(mean((lasso.pred-Y.test)^2))
cat("Root Mean Squared Error (RMSE):", rmse, "\n")
## Root Mean Squared Error (RMSE): 117.3701

In this case we find the lasso not to be a significant improvement over the linear regression model. This is most likely caused by lasso not capturing non linear relationships which are captured in random forest.

The next method we will assess is gradient boosting machines

set.seed(1)

gbm_model <- gbm(price ~ .-last_review,data = train_data,  distribution = "gaussian",
                 n.trees = 1000,
                 interaction.depth = 3,
                 shrinkage = 0.01,
                 n.minobsinnode = 10,
                 cv.folds = 5,
                 verbose = FALSE)

# Determine the optimal tree number
optimal_trees <- gbm.perf(gbm_model, method = "cv")

# Compute error rates
predictions_2 <- predict(gbm_model, newdata = test_data, n.trees = optimal_trees)
performance <- postResample(predictions_2, test_data$price)
print(performance)
##        RMSE    Rsquared         MAE 
## 105.0167686   0.5511735  59.4993410
library(ggplot2)

pred_vs_true <- data.frame(Predicted = predictions_2, True = test_data$price)
ggplot(pred_vs_true, aes(x = True, y = Predicted)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(title = "GBM Model: Predicted vs. True Prices",
       x = "True Prices",
       y = "Predicted Prices") +
  theme_minimal()

Gradient boosting produces slightly worse error rates then the random forest method. Next we will build a support vector machine model, to do this we will normalize the features so they are all using the same scale.

set.seed(1)

# Normalizing train and test split
preprocess_params <- preProcess(train_data[, -which(names(train_data) == "price")], method = c("center", "scale"))

train_data_scaled <- predict(preprocess_params, train_data)
test_data_scaled <- predict(preprocess_params, test_data)
test_data_scaled$price <- test_data$price

# Training model with svm()
svm_model <- svm(price ~ .,
                 data = train_data_scaled,
                 kernel = "radial",
                 cost = 1,
                 gamma = 0.1,
                 epsilon = 0.1)

# Computing error rates
predictions <- predict(svm_model, newdata = test_data_scaled)
performance <- postResample(predictions, test_data_scaled$price)
print(performance)
##        RMSE    Rsquared         MAE 
## 110.1921828   0.5280257  58.3953594

The SVM performs worse then the random forest and the GBM methods. However it is worth noting it is a faster training model for not much worse performance.

library(ggplot2)

pred_vs_true <- data.frame(Predicted = predictions, True = test_data_scaled$price)
ggplot(pred_vs_true, aes(x = True, y = Predicted)) +
  geom_point() +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(title = "SVM Model: Predicted vs. True Prices",
       x = "True Prices",
       y = "Predicted Prices") +
  theme_minimal()

SVM shows a 20% improvement in Mean absolute error over our baseline linear regression, and a 7% improvement in Root mean square error.

However our best performing model overall is the random forest, performing 12% better in RMSE and roughly 20% better in MAE.